load("Normal_GLM.rdata")
source("Normal_GLM.r")

(three plots produced while sourcing Normal_GLM.r)

## [1] 15.59637
## [1] 10.63484
## [1] 6.27827
## [1] 2.279274
## [1] 0.3134914
## [1] 0.00670344
## [1] 3.489446e-06
## [1] 5.542233e-13
## [1] 3.552714e-14
## [1] 1.563194e-12
## [1] 0
## (Intercept)           x          x2 
##    11.45196    43.89866   -42.74408 
## (Intercept)           x          x2 
##     0.68391    17.95136   -17.74880 
## [1] 0

(three more plots produced while sourcing Normal_GLM.r)

library(carData)
library(car)

# (a)
xsquared = x^2
fit = lm(y~x+xsquared)
summary(fit)
## 
## Call:
## lm(formula = y ~ x + xsquared)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9940 -0.6093  0.0508  0.5487  4.1157 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.4348     0.5395   19.34  < 2e-16 ***
## x           -27.1207     2.2597  -12.00 6.46e-16 ***
## xsquared     27.5193     2.0916   13.16  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.08 on 47 degrees of freedom
## Multiple R-squared:  0.793,  Adjusted R-squared:  0.7842 
## F-statistic: 90.01 on 2 and 47 DF,  p-value: < 2.2e-16
# plot(fit)

# Test whether the quadratic term is significant (the p-value also appears in the summary printout)
linearHypothesis(fit, "xsquared = 0")
## Linear hypothesis test
## 
## Hypothesis:
## xsquared = 0
## 
## Model 1: restricted model
## Model 2: y ~ x + xsquared
## 
##   Res.Df     RSS Df Sum of Sq      F    Pr(>F)    
## 1     48 256.521                                  
## 2     47  54.776  1    201.75 173.11 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# fitH0 = lm(y~x)
# SSres = sum(fit$residuals^2)
# SSresH0 = sum(fitH0$residuals^2)
# 
# F = ((SSresH0 - SSres) / (SSres)) * ((50-3)/(1))
# 1-pf(F, 1, 50-3)
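# An equivalent nested-model check with anova() (should reproduce F = 173.11
# from linearHypothesis above; not run here):
# fitH0 = lm(y ~ x)
# anova(fitH0, fit)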

# Plot the residuals and decide whether the data are homoscedastic... possibly heteroscedastic
plot(fit$residuals)

(plot: residuals of the quadratic fit)

# (b) Calculating the log-likelihood

# eta_1i = mu_i / sigma_i^2; in the model from (a), sigma is constant (the sample variance of y)
beta1 = fit$coef / var(y)
# eta_2i = 1 / sigma_i^2; in (a), sigma is constant and does not depend on x at all
beta2 = c(1 / var(y), 0, 0)
ones = rep(1, length(x))
X1 = cbind(ones, x, x^2) 
X2 = cbind(ones, x, x^2)
lik(beta1, beta2, y, X1, X2)
## [1] -93.17752
# Gradient (note: the eta_1 components are essentially zero, the eta_2 components are far from it)
Dlik(beta1, beta2, y, X1, X2)
##  (Intercept)            x     xsquared         <NA>         <NA> 
## 2.273737e-13 1.421085e-13 5.684342e-14 1.076038e+02 5.057183e+01 
##         <NA> 
## 2.345618e+01
# Two explanations: (1) the model in part (a) is the submodel in which the x and x^2
# coefficients of eta_2 are zero, with the transformations above giving the betas;
# (2) from the gradient: the eta_1 components are essentially zero while the eta_2
# components are not, so the likelihood can only improve by letting the variance
# estimates depend on x. We are currently at a maximum of the submodel where sigma
# does not depend on x.
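# Sanity check on the reparameterization (a sketch): from eta_1i = mu_i/sigma_i^2
# and eta_2i = 1/sigma_i^2 we get sigma_i^2 = 1/eta_2i and mu_i = eta_1i/eta_2i,
# so the implied means should reproduce the fitted values from (a) exactly:
# eta1 = X1 %*% beta1
# eta2 = X2 %*% beta2
# max(abs(eta1/eta2 - fitted(fit)))  # expected to be ~0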


# (c) Optimizing

# Optimizing with Newton-Raphson
NewtonRaphsonOptim = function(currentBeta) {
  while (TRUE) {
    l = lik(currentBeta[1:3], currentBeta[4:6], y, X1, X2)
    Dl = Dlik(currentBeta[1:3], currentBeta[4:6], y, X1, X2)
    D2l = D2lik(currentBeta[1:3], currentBeta[4:6], y, X1, X2)

    # Stop once the squared norm of the gradient is numerically zero
    if (sum(Dl^2) < 10^(-8)) {
      return(currentBeta)
    }

    hm = -solve(D2l) %*% Dl  # full Newton step
    newBeta = currentBeta + hm
    newl = lik(newBeta[1:3], newBeta[4:6], y, X1, X2)

    # Step-halving: shrink the step until the log-likelihood actually improves
    # (and is not NaN, i.e. the step stays inside the parameter domain)
    while (is.nan(newl) || newl <= l) {
      hm = hm / 2
      newBeta = currentBeta + hm
      newl = lik(newBeta[1:3], newBeta[4:6], y, X1, X2)
    }

    currentBeta = newBeta
  }
}

betaMLE = NewtonRaphsonOptim(c(beta1, beta2))
betaMLE
##                  [,1]
## (Intercept)  11.45196
## x            43.89866
## xsquared    -42.74408
##               0.68391
##              17.95136
##             -17.74880
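# These match the coefficient values printed while sourcing Normal_GLM.r at the top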
# Optimizing with optim()
optimized = optim(c(beta1, beta2), 
      function(betaC) {lik(betaC[1:3], betaC[4:6], y, X1, X2)}, 
      gr=function(betaC) {Dlik(betaC[1:3], betaC[4:6], y, X1, X2)},
      method="BFGS", # This is a "quasi-Newton method (variable metric algorithm)" which takes advantage of the gradient 
      control=list(fnscale=-1))  # Maximize rather than minimize
## Warning in log(X2 %*% beta2): NaNs produced (warning repeated 10 times)
# NOTE: optim() evaluates the likelihood at candidate beta2 values for which
# X2 %*% beta2 is negative; log() returns NaN there, which triggers these warnings.
# Those steps are effectively rejected. The estimated beta2 makes sense, though:
# optimized$par[4] + x*optimized$par[5] + x^2*optimized$par[6]
optimized
## $par
## (Intercept)           x    xsquared                                     
##   11.452782   43.152789  -42.015401    0.686633   17.779954  -17.579979 
## 
## $value
## [1] -38.14069
## 
## $counts
## function gradient 
##      175      100 
## 
## $convergence
## [1] 1
## 
## $message
## NULL
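# Note: convergence = 1 means optim() stopped at its iteration limit (maxit
# defaults to 100 for "BFGS"). A longer run is a one-line change (not re-run here):
# optim(c(beta1, beta2), function(betaC) {lik(betaC[1:3], betaC[4:6], y, X1, X2)},
#       gr=function(betaC) {Dlik(betaC[1:3], betaC[4:6], y, X1, X2)},
#       method="BFGS", control=list(fnscale=-1, maxit=500))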
# Plot of data vs predictions from standard linear model
plot(x, y, col= "black", ylim=c(0,16))
predictedFromSLM = fit$coef[1] + fit$coef[2]*x + fit$coef[3]*(x^2) 
points(x, predictedFromSLM, col="#CC79A7") # pink

estimatedSigmaSq = 1/(betaMLE[4] + betaMLE[5]*x + betaMLE[6]*(x^2))
predictedFromGLM = (betaMLE[1] + betaMLE[2]*x + betaMLE[3]*(x^2)) * estimatedSigmaSq
points(x, predictedFromGLM, col="#0072B2") # blue

(plot: data in black, SLM predictions in pink, GLM predictions in blue)

residualsGLM = predictedFromGLM - y
plot(x, residualsGLM, ylim=c(-1.5,1.5))
#lines(x, residualsGLM+sd(residualsGLM))
#lines(x, residualsGLM-sd(residualsGLM))
#lines(x, residualsGLM+sqrt(estimatedSigmaSq), col="#0072B2")
#lines(x, residualsGLM-sqrt(estimatedSigmaSq), col="#0072B2")
lines(x, sqrt(estimatedSigmaSq), col="#0072B2")
lines(x, -sqrt(estimatedSigmaSq), col="#0072B2")

(plot: GLM residuals with ±(fitted sd) bands in blue)

# Residuals shifted up by the fitted sd: negative entries are points lying more
# than one fitted sd above the GLM mean
sqrt(estimatedSigmaSq)+residualsGLM
##  [1]  0.549529631  1.174919419  1.384026344  0.267883203  0.279101027
##  [6]  1.024177882  0.696313827  0.003416554  0.057145592  0.500768728
## [11] -0.194700852  0.538667581  0.399879820  0.892568680  0.245980870
## [16]  1.267327223  1.055431430  0.463237299  0.574406300 -0.131822446
## [21]  0.153859895  0.830536622  0.339394687  1.161379297  0.304933259
## [26] -0.094316573  0.601009163  0.718680660  0.572312482  1.545794462
## [31]  0.615642275  0.508539597 -0.124906773 -0.076105326 -0.039963368
## [36]  0.816250108 -0.257427489 -0.046640508  1.438289093  0.741633714
## [41] -0.310608441  0.113386481  0.297734621  1.115963483 -0.074063299
## [46]  0.142160212  1.550847205  1.566033415  1.503599687  0.127938509
residualsSLM = predictedFromSLM - y
plot(x, residualsSLM, ylim=c(-5,4))
#lines(x, residualsSLM+1.08)
#lines(x, residualsSLM-1.08)
lines(x, rep(1.08, length(x)), col="#0072B2")
lines(x, rep(-1.08, length(x)), col="#0072B2")

(plot: SLM residuals with ±1.08 (the residual standard error) bands in blue)
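# A further check (a sketch, not run here): dividing the GLM residuals by the
# fitted sd should give roughly constant spread with sd close to 1 if the
# variance model is adequate:
# standardizedGLM = residualsGLM / sqrt(estimatedSigmaSq)
# plot(x, standardizedGLM)
# sd(standardizedGLM)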

# (d) 
# test for quadratic term in eta_1 
NROptimNoQuadEta1 = function(currentBeta) {
  # Same Newton-Raphson with step-halving as above, but with the x^2 column
  # dropped from X1 (H0: no quadratic term in eta_1)
  while (TRUE) {
    l = lik(currentBeta[1:2], currentBeta[3:5], y, X1[,1:2], X2)
    Dl = Dlik(currentBeta[1:2], currentBeta[3:5], y, X1[,1:2], X2)
    D2l = D2lik(currentBeta[1:2], currentBeta[3:5], y, X1[,1:2], X2)

    if (sum(Dl^2) < 10^(-8)) {
      return(currentBeta)
    }

    hm = -solve(D2l) %*% Dl
    newBeta = currentBeta + hm
    newl = lik(newBeta[1:2], newBeta[3:5], y, X1[,1:2], X2)

    while (is.nan(newl) || newl <= l) {
      hm = hm / 2
      newBeta = currentBeta + hm
      newl = lik(newBeta[1:2], newBeta[3:5], y, X1[,1:2], X2)
    }

    currentBeta = newBeta
  }
}

H0betaMLE = NROptimNoQuadEta1(c(betaMLE[1:2], betaMLE[4:6]))
## Warning in log(X2 %*% beta2): NaNs produced (warning repeated 16 times)
H0betaMLE
##              [,1]
## [1,] 14.912839174
## [2,]  0.002674225
## [3,]  1.118181109
## [4,]  9.949501118
## [5,] -9.914566856
# Likelihood-ratio test: dev0 - dev1 is approximately chisq(1) under H0
dev1 = -2*lik(betaMLE[1:3], betaMLE[4:6], y, X1, X2)
dev0 = -2*lik(H0betaMLE[1:2], H0betaMLE[3:5], y, X1[,1:2], X2)

1-pchisq(dev0 - dev1, df=1)
## [1] 0.0002425917
dev0-dev1
## [1] 13.46858
# test for quadratic term in eta_2 
NROptimNoQuadEta2 = function(currentBeta) {
  # Same Newton-Raphson with step-halving as above, but with the x^2 column
  # dropped from X2 (H0: no quadratic term in eta_2)
  while (TRUE) {
    l = lik(currentBeta[1:3], currentBeta[4:5], y, X1, X2[,1:2])
    Dl = Dlik(currentBeta[1:3], currentBeta[4:5], y, X1, X2[,1:2])
    D2l = D2lik(currentBeta[1:3], currentBeta[4:5], y, X1, X2[,1:2])

    if (sum(Dl^2) < 10^(-8)) {
      return(currentBeta)
    }

    hm = -solve(D2l) %*% Dl
    newBeta = currentBeta + hm
    newl = lik(newBeta[1:3], newBeta[4:5], y, X1, X2[,1:2])

    while (is.nan(newl) || newl <= l) {
      hm = hm / 2
      newBeta = currentBeta + hm
      newl = lik(newBeta[1:3], newBeta[4:5], y, X1, X2[,1:2])
    }

    currentBeta = newBeta
  }
}
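# The three Newton-Raphson loops above differ only in which design columns they
# keep. A consolidated sketch (same lik/Dlik/D2lik from Normal_GLM.r; cols1/cols2
# select the columns of X1/X2, and the beta entries are split to match):
NROptimGeneric = function(currentBeta, cols1, cols2) {
  i1 = seq_along(cols1)
  i2 = length(cols1) + seq_along(cols2)
  M1 = X1[, cols1, drop=FALSE]
  M2 = X2[, cols2, drop=FALSE]
  while (TRUE) {
    l = lik(currentBeta[i1], currentBeta[i2], y, M1, M2)
    Dl = Dlik(currentBeta[i1], currentBeta[i2], y, M1, M2)
    D2l = D2lik(currentBeta[i1], currentBeta[i2], y, M1, M2)
    if (sum(Dl^2) < 10^(-8)) {
      return(currentBeta)
    }
    hm = -solve(D2l) %*% Dl
    newBeta = currentBeta + hm
    newl = lik(newBeta[i1], newBeta[i2], y, M1, M2)
    while (is.nan(newl) || newl <= l) {  # step-halving, as above
      hm = hm / 2
      newBeta = currentBeta + hm
      newl = lik(newBeta[i1], newBeta[i2], y, M1, M2)
    }
    currentBeta = newBeta
  }
}
# e.g. NROptimGeneric(c(betaMLE[1:3], betaMLE[4:5]), 1:3, 1:2) reproduces NROptimNoQuadEta2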

H0betaMLE = NROptimNoQuadEta2(c(betaMLE[1:3], betaMLE[4:5]))
## Warning in log(X2 %*% beta2): NaNs produced (warning repeated 15 times)
H0betaMLE
##             [,1]
## [1,]   9.9645446
## [2,] -25.5552970
## [3,]  25.0106087
## [4,]   0.9747917
## [5,]  -0.1166871
dev1 = -2*lik(betaMLE[1:3], betaMLE[4:6], y, X1, X2)
dev0 = -2*lik(H0betaMLE[1:3], H0betaMLE[4:5], y, X1, X2[,1:2])

1-pchisq(dev0 - dev1, df=1)
## [1] 1.110223e-16
# Test the full GLM against the SLM from (a) (df = 2: the x and x^2 coefficients of eta_2)
dev1 = -2*lik(betaMLE[1:3], betaMLE[4:6], y, X1, X2)
dev0 = -2*lik(beta1, beta2, y, X1, X2)

1-pchisq(dev0 - dev1, df=2)
## [1] 0
# (e)
predictionSimulation = function(xVal, numTrials) {
  mseSLMmean = 0
  mseGLMmean = 0

  mseSLMsd = 0
  mseGLMsd = 0

  # Treat the fitted GLM from (c) as the true data-generating model
  trueVariances = estimatedSigmaSq
  trueMeans = predictedFromGLM
  # True mean and sd at xVal under that model
  modelVar = 1/(betaMLE[4] + betaMLE[5]*xVal + betaMLE[6]*(xVal^2))
  modelSd = sqrt(modelVar)
  modelMean = (betaMLE[1] + betaMLE[2]*xVal + betaMLE[3]*(xVal^2)) * modelVar

  for (trial in 1:numTrials) {
    simulatedY = rnorm(length(x), mean=trueMeans, sd=sqrt(trueVariances))
    newSLMfit = lm(simulatedY~x+xsquared)
    newSLMPrediction = newSLMfit$coef[[1]] + newSLMfit$coef[[2]]*xVal + newSLMfit$coef[[3]]*xVal^2
    newSLMsd = sd(newSLMfit$residuals)

    newBeta1 = newSLMfit$coef / var(simulatedY)  
    newBeta2 = c(1 / var(simulatedY), 0, 0)  
    newBetaMLE = NewtonRaphsonOptim(c(newBeta1, newBeta2))

    newGLMEstVariance = 1/(newBetaMLE[4] + newBetaMLE[5]*xVal + newBetaMLE[6]*(xVal^2))
    newPredictedY = (newBetaMLE[1] + newBetaMLE[2]*xVal 
                     + newBetaMLE[3]*(xVal^2)) * newGLMEstVariance
    newGLMsd = sqrt(newGLMEstVariance)

    mseSLMmean = mseSLMmean + (newSLMPrediction - modelMean)^2
    mseGLMmean = mseGLMmean + (newPredictedY - modelMean)^2

    mseSLMsd = mseSLMsd + (newSLMsd - modelSd)^2
    mseGLMsd = mseGLMsd + (newGLMsd - modelSd)^2
  }

  mseSLMmean = (1/numTrials) * mseSLMmean
  mseGLMmean = (1/numTrials) * mseGLMmean

  mseSLMsd = (1/numTrials) * mseSLMsd
  mseGLMsd = (1/numTrials) * mseGLMsd

  return(c(mseSLMmean, mseGLMmean, mseSLMsd, mseGLMsd))
}
predictionSimulation(0.75, 100)
## [1] 4.779416e-01 3.737011e-16 3.058965e-01 7.203965e-13
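# A sketch extending the comparison to several x values (columns = x values; rows =
# mseSLMmean, mseGLMmean, mseSLMsd, mseGLMsd; not run here):
# sapply(c(0.25, 0.5, 0.75), predictionSimulation, numTrials = 100)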